Here I study how biomass density changes across treatments in the PatchSizePilot. In particular, I’m studying how biomass density changes across:
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## here() starts at /Users/ema/github/PatchSizePilot
culture_info = read.csv(here("data", "PatchSizePilot_culture_info.csv"), header = TRUE)
load(here("data", "population", "t0.RData")); t0 = pop_output
load(here("data", "population", "t1.RData")); t1 = pop_output
load(here("data", "population", "t2.RData")); t2 = pop_output
load(here("data", "population", "t3.RData")); t3 = pop_output
load(here("data", "population", "t4.RData")); t4 = pop_output
load(here("data", "population", "t5.RData")); t5 = pop_output
load(here("data", "population", "t6.RData")); t6 = pop_output
load(here("data", "population", "t7.RData")); t7 = pop_output
rm(pop_output)
t0$time = NA
t1$time = NA
t0$replicate_video = 1:12 #In t1 I took 12 videos of a single
t1$replicate_video = 1 #In t1 I took only 1 video/culture
t2$replicate_video = 1 #In t2 I took only 1 video/culture
t3$replicate_video = 1 #In t3 I took only 1 video/culture
t4$replicate_video = 1 #In t4 I took only 1 video/culture
t5$replicate_video = 1 #In t5 I took only 1 video/culture
t6$replicate_video = t6$replicate; t6$replicate = NULL
t7$replicate_video = t7$replicate; t7$replicate = NULL
#Create an elongated version of t0 so that each of the 110 cultures can have 12 video replicates at t0.
elongating_t0 = NULL
for (video in 1:nrow(t0)){
for (ID in 1:nrow(culture_info)) {
elongating_t0 = rbind(elongating_t0, t0[video,])
}
}
ID_vector = rep(1:nrow(culture_info), times = nrow(t0))
elongating_t0$culture_ID = ID_vector
t0 = merge(culture_info,elongating_t0, by="culture_ID")
t1 = merge(culture_info,t1,by="culture_ID")
t2 = merge(culture_info,t2,by="culture_ID")
t3 = merge(culture_info,t3,by="culture_ID")
t4 = merge(culture_info,t4,by="culture_ID")
t5 = merge(culture_info,t5,by="culture_ID")
t6 = merge(culture_info,t6,by="culture_ID")
t7 = merge(culture_info,t7,by="culture_ID")
ds = rbind(t0, t1, t2, t3, t4, t5, t6, t7); rm(elongating_t0, t0, t1, t2, t3, t4, t5, t6, t7)
# Switch from data point to day
ds$day = ds$time_point; ds$time_point = NULL
ds$day[ds$day=="t0"] = "0"
ds$day[ds$day=="t1"] = "4"
ds$day[ds$day=="t2"] = "8"
ds$day[ds$day=="t3"] = "12"
ds$day[ds$day=="t4"] = "16"
ds$day[ds$day=="t5"] = "20"
ds$day[ds$day=="t6"] = "24"
ds$day[ds$day=="t7"] = "28"
ds$day = as.numeric(ds$day)
ds = ds %>%
select(culture_ID, patch_size, disturbance, metaecosystem_type, bioarea_per_volume, replicate_video, day, metaecosystem, system_nr, eco_metaeco_type)
ds = ds[, c("culture_ID", "system_nr", "disturbance", "day", "patch_size", "metaecosystem", "metaecosystem_type", "eco_metaeco_type", "replicate_video","bioarea_per_volume")]
ds$eco_metaeco_type = factor(ds$eco_metaeco_type, levels=c('S', 'S (S_S)', 'S (S_L)', 'M', 'M (M_M)', 'L', 'L (L_L)', 'L (S_L)'))
ds_regional = ds %>%
group_by(culture_ID, system_nr, disturbance, day, patch_size, metaecosystem_type) %>%
summarise(patch_mean_bioarea_across_videos = mean(bioarea_per_volume)) %>%
group_by(system_nr, disturbance, day,metaecosystem_type) %>%
summarise(regional_mean_bioarea = mean(patch_mean_bioarea_across_videos))
Let’s now save all the plots together in a single image. As the single image is too large for R markdown, it cannot be displayed here.
grid = grid.arrange(p[[1]],p[[3]],p[[5]],p[[2]],p[[4]],p[[6]],
ncol=3, nrow=2,
top = textGrob("Low disturbance",gp=gpar(fontsize=20,font=3)))
ggsave(here("results", "biomass", "Clean_biomass_low.jpg"), grid, width = 22, height = 13)
Let’s save also here for high disturbance.
biomass_plots("high")
grid = grid.arrange(p[[1]],p[[3]],p[[5]],p[[2]],p[[4]],p[[6]],
ncol=3, nrow=2,
top = textGrob("High disturbance",gp=gpar(fontsize=20,font=3)))
ggsave(here("results", "biomass", "Clean_biomass_high.jpg"), grid, width = 22, height = 13)
To build the mixed effect models we will use the R package lme4. See page 6 of this PDF to know more about the syntaxis of this package.
To do model diagnostics, I’m going to look at the following two plots (as suggested by Zuur et al. (2009), page 487):
When modelling the regional biomass, we want to understand if the regional biomass produced by an ecosystem with a small and a large patch (metaecosystem_type = S_L) is lower than the regional biomass produced by an ecosystem with two medium patches (metaecosystem_type = M_M).
First of all, let’s produce a data set including the regional biomass of all our meta ecosystems. In this data set, not only we want to have the regional biomass of the meta-ecosystems (averaged first across videos and then across patches) but also:
We want to include only the meta-ecosystems in which patches had both medium size (metaecosystem_type = M_M) and meta-ecosystems in which patches had one a small size and the other large size (metaecosystem_type = S_L).
We want to take off the first time point (day = 0). This is because all the meta-ecosystems had the same regional biomass, which was a construct of our experimental design.
ds_regional_shrunk_type_n_day = ds_regional %>%
filter (metaecosystem_type == "M_M" | metaecosystem_type == "S_L", day != 0)
First of all, let’s include time as a random variable. Let’s start from the largest mixed effect model makes sense to construct. This will include:
We are actually going to construct two full models. One with the correlated and one with the uncorrelated random slopes and intercept, the other without.
#Uncorrelated intercepts and slopes
full_model = lmer(regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + (metaecosystem_type || day) + (disturbance || day) + (metaecosystem_type*disturbance || day) + (metaecosystem_type || system_nr) + (disturbance || system_nr) + (metaecosystem_type*disturbance || system_nr), data = ds_regional_shrunk_type_n_day, REML = FALSE)
#Correlated intercepts and slopes
full_model_correlated = lmer(regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + (metaecosystem_type | day) + (disturbance | day) + (metaecosystem_type*disturbance | day) + (metaecosystem_type | system_nr) + (disturbance | system_nr) + (metaecosystem_type*disturbance | system_nr), data = ds_regional_shrunk_type_n_day, REML = FALSE)
anova(full_model, full_model_correlated)
## Data: ds_regional_shrunk_type_n_day
## Models:
## full_model_correlated: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + (metaecosystem_type | day) + (disturbance | day) + (metaecosystem_type * disturbance | day) + (metaecosystem_type | system_nr) + (disturbance | system_nr) + (metaecosystem_type * disturbance | system_nr)
## full_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day)) + ((1 | day) + (0 + metaecosystem_type | day) + (0 + disturbance | day) + (0 + metaecosystem_type:disturbance | day)) + ((1 | system_nr) + (0 + metaecosystem_type | system_nr)) + ((1 | system_nr) + (0 + disturbance | system_nr)) + ((1 | system_nr) + (0 + metaecosystem_type | system_nr) + (0 + disturbance | system_nr) + (0 + metaecosystem_type:disturbance | system_nr))
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## full_model_correlated 37 2221.3 2330.2 -1073.7 2147.3
## full_model 55 2257.3 2419.1 -1073.7 2147.3 0 18 1
Although the AIC for the model with correlated slopes is lower, there is not statistical significance between the two models. Let’s then keep as a full model the one with non correlated slopes (full_model).
no_system_nr_model = lmer(regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + (metaecosystem_type || day) + (disturbance || day) + (metaecosystem_type*disturbance || day), data = ds_regional_shrunk_type_n_day, REML = FALSE)
anova(full_model, no_system_nr_model)
## Data: ds_regional_shrunk_type_n_day
## Models:
## no_system_nr_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day)) + ((1 | day) + (0 + metaecosystem_type | day) + (0 + disturbance | day) + (0 + metaecosystem_type:disturbance | day))
## full_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day)) + ((1 | day) + (0 + metaecosystem_type | day) + (0 + disturbance | day) + (0 + metaecosystem_type:disturbance | day)) + ((1 | system_nr) + (0 + metaecosystem_type | system_nr)) + ((1 | system_nr) + (0 + disturbance | system_nr)) + ((1 | system_nr) + (0 + metaecosystem_type | system_nr) + (0 + disturbance | system_nr) + (0 + metaecosystem_type:disturbance | system_nr))
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## no_system_nr_model 30 2217.1 2305.4 -1078.5 2157.1
## full_model 55 2257.3 2419.1 -1073.7 2147.3 9.7713 25 0.9972
The difference between the two doesn’t seem to be significant. Therefore, let’s throw out the system_nr from our model.
fixed_effects_model = lm(regional_mean_bioarea ~ metaecosystem_type + disturbance, data = ds_regional_shrunk_type_n_day)
anova(no_system_nr_model, fixed_effects_model)
## Data: ds_regional_shrunk_type_n_day
## Models:
## fixed_effects_model: regional_mean_bioarea ~ metaecosystem_type + disturbance
## no_system_nr_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day)) + ((1 | day) + (0 + metaecosystem_type | day) + (0 + disturbance | day) + (0 + metaecosystem_type:disturbance | day))
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## fixed_effects_model 4 2363.1 2374.9 -1177.6 2355.1
## no_system_nr_model 30 2217.1 2305.4 -1078.5 2157.1 198.02 26 < 2.2e-16
##
## fixed_effects_model
## no_system_nr_model ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The full model is better, as we expected. Time plays such an important part in explaining the way that biomass changes.
no_interaction_model = lmer(regional_mean_bioarea ~ metaecosystem_type + disturbance + (metaecosystem_type || day) + (disturbance || day), data = ds_regional_shrunk_type_n_day, REML = FALSE)
anova(no_system_nr_model, no_interaction_model)
## Data: ds_regional_shrunk_type_n_day
## Models:
## no_interaction_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day))
## no_system_nr_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day)) + ((1 | day) + (0 + metaecosystem_type | day) + (0 + disturbance | day) + (0 + metaecosystem_type:disturbance | day))
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## no_interaction_model 12 2183.2 2218.4 -1079.6 2159.2
## no_system_nr_model 30 2217.1 2305.4 -1078.5 2157.1 2.0436 18 1
There is no statistical significance between the two models. Therefore, let’s drop the interaction between disturbance and meta-ecosystem type.
no_slopes_model = lmer(regional_mean_bioarea ~ metaecosystem_type + disturbance + (1 | day), data = ds_regional_shrunk_type_n_day, REML = FALSE)
anova(no_interaction_model, no_slopes_model)
## Data: ds_regional_shrunk_type_n_day
## Models:
## no_slopes_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + (1 | day)
## no_interaction_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day))
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## no_slopes_model 5 2185.8 2200.5 -1087.9 2175.8
## no_interaction_model 12 2183.2 2218.4 -1079.6 2159.2 16.598 7 0.02018 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Yes. We should keep the random slopes. Well, it seems like this is our best model then.
After model selection, we determined that the best model is the following one:
regional_mean_bioarea ~ metaecosystem_type + disturbance + (metaecosystem_type || day) + (disturbance || day)
Let’s call it then best_model to don’t get confused.
best_model = no_interaction_model
Let’s now calculate the statistical significance and effect size of
the best model. The marginal and conditional r squared are calculated
using the package MuMIn. For the coding and interpretation
of these r squared check the documentation
for the r.squaredGLMM function.
model.null = lmer(regional_mean_bioarea ~ (1 | day), data = ds_regional_shrunk_type_n_day, REML = FALSE)
r2_best = r.squaredGLMM(best_model)
## Warning: 'r.squaredGLMM' now calculates a revised statistic. See the help page.
r2_best = round(r2_best, digits = 3)
anova = anova(best_model, model.null)
p_best = anova$`Pr(>Chisq)`[2]
p_best = round(p_best, digits = 5)
if (p_best < 0.00001) {
p_best = "< 0.00001"
}
metaecosystem_type_null = lmer(regional_mean_bioarea ~ disturbance + (disturbance || day),
data = ds_regional_shrunk_type_n_day,
REML = FALSE,
control = lmerControl(optimizer ='optimx',
optCtrl=list(method='L-BFGS-B')))
r2_no_metaecosystem_type = r.squaredGLMM(metaecosystem_type_null)
r2_no_metaecosystem_type = round(r2_no_metaecosystem_type, digits = 3)
anova = anova(best_model, metaecosystem_type_null)
p_no_metaecosystem_type = anova$`Pr(>Chisq)`[2]
p_no_metaecosystem_type = round(p_no_metaecosystem_type, digits = 5)
if (p_no_metaecosystem_type < 0.00001) {
p_no_metaecosystem_type = "< 0.00001"
}
disturbance_null = lmer(regional_mean_bioarea ~ metaecosystem_type + (metaecosystem_type || day),
data = ds_regional_shrunk_type_n_day,
REML = FALSE,
control = lmerControl(optimizer ='optimx',
optCtrl=list(method='L-BFGS-B')))
r2_no_disturbance = r.squaredGLMM(disturbance_null)
r2_no_disturbance = round(r2_no_disturbance, digits = 3)
anova = anova(best_model, disturbance_null)
p_no_disturbance = anova$`Pr(>Chisq)`[2]
p_no_disturbance = round(p_no_disturbance, digits = 5)
if (p_no_disturbance < 0.00001) {
p_no_disturbance = "< 0.00001"
}
The effect size and significance of our results can be found in the following table.
| Model | Marginal R2 | Marginal R2 of the best model explained | Conditional R2 | Conditional R2 of the best model explained | P-value |
|---|---|---|---|---|---|
| Best model | 0.077 | / | 0.872 | / | < 0.00001 |
| Best model without meta-ecosystem type | 0.039 | 0.038 | 0.746 | 0.126 | < 0.00001 |
| Best model without disturbance | 0.06 | 0.017 | 0.77 | 0.102 | < 0.00001 |
This means that meta-ecosystem type should explain more than disturbance regional biomass dynamics. However, the effect size isn’t that high (0.038). Because of this, we thought we should fit a function to the time dynamics of the regional biomass.
Here we want to fit how biomass changes across time to a function. The biomass of our meta-ecosystems looks like this.
ds_regional_shrunk_type = ds_regional %>%
filter(metaecosystem_type == "M_M" | metaecosystem_type == "S_L")
ds_regional_shrunk_type%>%
ggplot(aes(x = day,
y = regional_mean_bioarea,
group = day)) +
geom_boxplot() +
labs(x = "day", y = "regional bioarea")
To fit these data, we need to produce (and parameterise) a function that resemble how the biomass first increases and then decreases.
Hank produced the following function:
\[biomass = a_4 * (day - a_5) * e^{a_1(day - a_5)}\]
If we parameterise the function and then plot, it looks as follows.
a1 = -0.1
a4 = 1200
a5 = -1
day = seq(0, 30, 0.01)
biomass = a4*(day-a5) * exp(a1*(day-a5))
plot(biomass ~ day)
Now, let’s find the best parameters (a1, a4, a5) that fit our data.
model = nls(regional_mean_bioarea ~ a4 * (day-a5) * exp(a1 * (day-a5)),
start = list(a1 = -0.1, a4 = 1200, a5 = -1),
trace = T,
data = ds_regional)
## 1.552643e+09 (1.84e+00): par = (-0.1 1200 -1)
## 3.954770e+08 (3.54e-01): par = (-0.1156149 958.8765 -1.825904)
## 3.511665e+08 (4.17e-02): par = (-0.1315285 1084.387 -1.941357)
## 3.505295e+08 (2.97e-03): par = (-0.1352505 1132.741 -1.826744)
## 3.505263e+08 (1.08e-04): par = (-0.1354468 1135.895 -1.824207)
## 3.505263e+08 (5.46e-06): par = (-0.1354567 1136.023 -1.823884)
model$m$getPars()
## a1 a4 a5
## -0.1354567 1136.0225454 -1.8238838
a1 = as.numeric(model$m$getPars()[1])
a4 = as.numeric(model$m$getPars()[2])
a5 = as.numeric(model$m$getPars()[3])
And now let’s plot the function to see how it fits our data.
day = seq(0,30,1)
predicted = a4*(day-a5)*exp(a1*(day-a5))
data_fitted=data.frame(day=day,regional_mean_bioarea=predicted)
ds_regional%>%
ggplot(aes(x = day,
y = regional_mean_bioarea,
group = day)) +
geom_boxplot() +
labs(x = "day", y = "regional bioarea") +
geom_line(data=data_fitted,aes(x = day, y=regional_mean_bioarea),color="red", group = 1)
Let’s try to convert the biomass into its logarithm. So, let’s plot it first.
ds_regional %>%
ggplot(aes(x = day,
y = log(regional_mean_bioarea),
group = day)) +
geom_boxplot()
## Warning: Removed 7 rows containing non-finite values (stat_boxplot).
It seems like the relationship between the logarithm fo regional biomass and time could be linear. If it is, we could just add to our mixed model the intercept and the slope for the logarithm of time.
Let’s see if there is actually a linear relationship between the logarithm of regional biomass and time.
plot(y = ds_regional$regional_mean_bioarea, x = ds_regional$day)
model = lm(regional_mean_bioarea ~ day, data = ds_regional)
abline(model)
summary = summary(model)
summary
##
## Call:
## lm(formula = regional_mean_bioarea ~ day, data = ds_regional)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1782.3 -876.6 -101.6 692.7 2757.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2764.508 74.577 37.07 <2e-16 ***
## day -66.617 4.457 -14.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 966.6 on 558 degrees of freedom
## Multiple R-squared: 0.2859, Adjusted R-squared: 0.2846
## F-statistic: 223.4 on 1 and 558 DF, p-value: < 2.2e-16
anova = anova(model)
par(mfrow=c(2,3))
plot(model, which = 1:5)
par(mfrow=c(1,1))
abline(model)
log_day_intercept = summary$coefficients[1,1]
log_day_slope = summary$coefficients[2,1]
The r squared is 0.2859145 and the p-value 9.8786927^{-43}. The model is as follows:
y = 2764.5078829 + -66.6172399 log(day).
I’m not sure if these residual plots are okay. I better ask Hank.
if I can use this model and output it into the mixed effect model, I would something similar to what follows.
ds_regional$log_day_intercept = log_day_intercept
ds_regional$log_day_slope = log_day_slope
ds_regional_M_M_S_L = ds_regional %>%
filter(metaecosystem_type == "S_L" | metaecosystem_type == "M_M" )
system_nr_no_interaction_no_slopes_model = lmer(regional_mean_bioarea ~ log_day_intercept + log_day_slope * log(day) + metaecosystem_type + disturbance + (1 | system_nr), data = ds_regional_M_M_S_L, REML = FALSE)
The code doesn’t work.
I’m not sure about how to introduce the estimated parameters in the mixed model.
Let’s try to do the same thing but with a square root transformation.
ds_regional %>%
ggplot(aes(x = day,
y = sqrt(regional_mean_bioarea),
group = day)) +
geom_boxplot()
It doesn’t look that good. I’m not even going to try.
plot(full_model)
qqnorm(resid(full_model))
plot(full_model_correlated)
qqnorm(resid(full_model_correlated))
plot(no_system_nr_model)
qqnorm(resid(no_system_nr_model))
par(mfrow=c(2,3))
plot(fixed_effects_model, which = 1:5)
par(mfrow=c(1,1))
plot(no_system_nr_model)
qqnorm(resid(no_system_nr_model))
plot(no_slopes_model)
qqnorm(resid(no_slopes_model))
plot(metaecosystem_type_null)
qqnorm(resid(metaecosystem_type_null))
plot(disturbance_null)
qqnorm(resid(disturbance_null))